# Basic Libraries
import numpy as np
import pandas as pd
from warnings import filterwarnings
from collections import Counter
# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
import missingno as msno
# Data Pre-processing Libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
# Modelling Libraries
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC,LinearSVC,NuSVC
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import VotingClassifier
# Evaluation & CV Libraries
from sklearn.metrics import precision_score,accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,RepeatedStratifiedKFold
# Load the water-potability samples a single time. The script previously read
# the same CSV twice and threw the first result away.
df = pd.read_csv('water_potability.csv')
# Notebook-style previews: the bare expressions only render in an interactive
# session, while df.info() prints its summary in any context.
df
df.head()
df.describe()
df.info()
# Five-step colour ramps (darkest first) shared by every chart below.
colors_blue = [
    '#132C33',
    '#264D58',
    '#17869E',
    '#51C4D3',
    '#B4DBE9',
]
colors_dark = [
    '#1F1F1F',
    '#313131',
    '#636363',
    '#AEAEAE',
    '#DADADA',
]
colors_green = [
    '#01411C',
    '#4B6F44',
    '#4F7942',
    '#74C365',
    '#D0F0C0',
]
# Preview each palette as a strip of swatches (blue, then green, then dark).
for _palette in (colors_blue, colors_green, colors_dark):
    sns.palplot(_palette)
# Class balance of the target as a donut chart.
# The counts frame is built with explicit column names and an explicit
# 0/1 -> label mapping. The earlier version relied on value_counts() naming its
# column 'Potability' (pandas >= 2.0 names it 'count') and on class 0 being the
# most frequent so that the hard-coded name list lined up.
potability_counts = df['Potability'].value_counts().sort_index()
d = pd.DataFrame({
    'label': potability_counts.index.map({0: 'Not Potable', 1: 'Potable'}),
    'Potability': potability_counts.to_numpy(),
})
fig = px.pie(
    d,
    values='Potability',
    names='label',
    hole=0.4,
    opacity=0.6,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    labels={'label': 'Potability', 'Potability': 'No. Of Samples'},
)
fig.add_annotation(
    text='We can resample the data<br> to get a balanced dataset',
    x=1.2, y=0.9, showarrow=False, font_size=12, opacity=0.7, font_family='monospace',
)
# Caption placed in the centre of the donut hole.
fig.add_annotation(
    text='Potability',
    x=0.5, y=0.5, showarrow=False, font_size=14, opacity=0.7, font_family='monospace',
)
fig.update_layout(
    font_family='monospace',
    title=dict(
        text='Q. How many samples of water are Potable?',
        x=0.47, y=0.98,
        font=dict(color=colors_dark[2], size=20),
    ),
    legend=dict(x=0.37, y=-0.05, orientation='h', traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'),
)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()
#Hardness of water:
#The simple definition of water hardness is the amount of dissolved calcium and magnesium in the water.
#Hard water is high in dissolved minerals, largely calcium and magnesium. You may have felt the effects of hard water,
#literally, the last time you washed your hands. Depending on the hardness of your water, after using soap to wash you may
#have felt like there was a film of residue left on your hands. In hard water, soap reacts with the calcium
#(which is relatively high in hard water) to form "soap scum". When using hard water, more soap or detergent is needed
#to get things clean, be it your hands, hair, or your laundry.
# Hardness histogram per potability class, with a box-plot marginal.
# Only `x` is passed: histfunc='count' counts rows per bin on its own. The
# earlier y=Counter(...) handed plotly a dict whose length equals len(df) only
# while every value is unique, so any repeated value would break the call.
fig = px.histogram(
    df,
    x='Hardness',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guides at the boundaries between the hardness classes annotated below.
for boundary in (151, 301, 76):
    fig.add_vline(x=boundary, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=.7)
fig.add_annotation(text='<76 mg/l is <br> considered soft', x=40, y=130, showarrow=False, font_size=9)
# Unit parenthesis closed; the original label read '(mg/L is considered'.
fig.add_annotation(text='Between 76 and 150 (mg/L)<br> is considered <br> moderately hard',
                   x=113, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='Between 151 and 300(mg/L)<br> is considered Hard', x=250, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='>300 mg/L is <br> considered very Hard', x=340, y=130, showarrow=False, font_size=9)
fig.update_layout(
    font_family='monospace',
    # Title typo fixed (was 'Hard Distrisbution').
    title=dict(text='Hardness Distribution', x=.53, y=.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Hardness (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=.3,
)
# Render explicitly, as the other charts do, so the figure is not lost when
# this is not the last expression of a notebook cell.
fig.show()
#pH level:
#The pH of water is a measure of the acid–base equilibrium and, in most natural waters,
#is controlled by the carbon dioxide–bicarbonate–carbonate equilibrium system. An increased carbon dioxide concentration
#will therefore lower pH, whereas a decrease will cause it to rise. Temperature will also affect the equilibria and the pH.
#In pure water, a decrease in pH of about 0.45 occurs as the temperature is raised by 25 °C. In water with a buffering capacity
#imparted by bicarbonate, carbonate and hydroxyl ions, this temperature effect is modified (APHA, 1989).
#The pH of most drinking-water lies within the range 6.5–8.5.
#Natural waters can be of lower pH, as a result of, for example, acid rain or higher pH in limestone areas.
# pH histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='ph',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Neutral pH guide separating the acidic and basic annotations.
fig.add_vline(x=7, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig.add_annotation(text='<7 is Acidic', x=4, y=70, showarrow=False, font_size=10)
fig.add_annotation(text='>7 is Basic', x=10, y=70, showarrow=False, font_size=10)
fig.update_layout(
    font_family='monospace',
    title=dict(text='pH Level Distribution', x=0.5, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='pH Level',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
#TDS: TDS means concentration of dissolved particles or solids in water.
#TDS comprises inorganic salts such as calcium, magnesium, chlorides, sulfates, bicarbonates, etc.,
#along with many more inorganic compounds that easily dissolve in water.
# Total-dissolved-solids histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig2 = px.histogram(
    df,
    x='Solids',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
fig2.update_layout(
    font_family='monospace',
    title=dict(text="Distribution of Total Dissolved Solids", x=.5, y=.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Dissolved Solids(ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=.3,
)
# Render explicitly, as the other charts do; the figure was previously built
# but never shown.
fig2.show()
#Sulfate: Sulfate (SO4) can be found in almost all natural water.
#The origin of most sulfate compounds is the oxidation of sulfite ores, the presence of shales,
#or the industrial wastes. Sulfate is one of the major dissolved components of rain.
#High concentrations of sulfate in the water we drink can have a laxative effect when combined with calcium and magnesium,
#the two most common constituents of hardness.
# Sulfate histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='Sulfate',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 250 mg/L limit quoted in the annotation.
fig.add_vline(x=250, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig.add_annotation(text='<250 mg/L is considered<br> safe for drinking', x=175, y=90, showarrow=False)
fig.update_layout(
    font_family='monospace',
    title=dict(text='Sulfate Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Sulfate (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
#Chloramines: Chloramines (also known as secondary disinfection) are disinfectants used to treat drinking water and they:
#Are most commonly formed when ammonia is added to chlorine to treat drinking water.
#Provide longer-lasting disinfection as the water moves through pipes to consumers.
#Chloramines have been used by water utilities since the 1930s.
# Chloramines histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig3 = px.histogram(
    df,
    x='Chloramines',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 4 ppm limit quoted in the annotation.
fig3.add_vline(x=4, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig3.add_annotation(text='<4 ppm is considered<br> safe for drinking', x=1.8, y=90, showarrow=False)
fig3.update_layout(
    font_family='monospace',
    title=dict(text='Chloramines Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Chloramines (ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig3.show()
# Sulfate histogram per potability class, kept under its own name `fig4`.
# NOTE(review): this repeats the Sulfate chart built earlier in the script with
# the same settings; confirm whether both are needed.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig4 = px.histogram(
    df,
    x='Sulfate',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 250 mg/L limit quoted in the annotation.
fig4.add_vline(x=250, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig4.add_annotation(text='<250 mg/L is considered<br> safe for drinking', x=175, y=90, showarrow=False)
fig4.update_layout(
    font_family='monospace',
    title=dict(text='Sulfate Distribution', x=0.53, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Sulfate (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig4.show()
#Conductivity: Conductivity is a measure of the ability of water to pass an electrical current.
#Because dissolved salts and other inorganic chemicals conduct electrical current, conductivity increases as salinity increases.
#Organic compounds like oil do not conduct electrical current very well and therefore have a low conductivity when in water.
#Conductivity is also affected by temperature: the warmer the water, the higher the conductivity.
# Conductivity histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='Conductivity',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
fig.add_annotation(
    text='The Conductivity range <br> is safe for both (200-800),<br> Potable and Non-Potable water',
    x=600, y=90, showarrow=False,
)
fig.update_layout(
    font_family='monospace',
    title=dict(text='Conductivity Distribution', x=0.5, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Conductivity (μS/cm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
#Organic Carbon: Organic contaminants (natural organic substances, insecticides, herbicides, and other agricultural chemicals)
#enter waterways in rainfall runoff. Domestic and industrial wastewaters also contribute organic contaminants in various amounts.
#As a result of accidental spills or leaks, industrial organic wastes may enter streams. Some of the contaminants may not be
#completely removed by treatment processes; therefore, they could become a problem for drinking water sources.
#It is important to know the organic content in a waterway.
# Organic-carbon histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='Organic_carbon',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 10 ppm level quoted in the annotation.
fig.add_vline(x=10, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig.add_annotation(text='Typical Organic Carbon<br> level is upto 10 ppm', x=5.3, y=110, showarrow=False)
fig.update_layout(
    font_family='monospace',
    title=dict(text='Organic Carbon Distribution', x=0.5, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Organic Carbon (ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
#Trihalomethanes: Trihalomethanes (THMs) are the result of a reaction between the chlorine used for disinfecting tap water and
#natural organic matter in the water.
#At elevated levels, THMs have been associated with negative health effects such as cancer and adverse reproductive outcomes.
# Trihalomethanes histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='Trihalomethanes',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 80 μg/L limit quoted in the annotation.
fig.add_vline(x=80, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig.add_annotation(text='Upper limit of Trihalomethanes<br> level is 80 μg/L', x=115, y=90, showarrow=False)
fig.update_layout(
    font_family='monospace',
    title=dict(text='Trihalomethanes Distribution', x=0.5, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Trihalomethanes (μg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
#Turbidity: Turbidity is the measure of relative clarity of a liquid.
#It is an optical characteristic of water and is a measurement of the amount of light that is scattered by material in the water
#when a light is shined through the water sample. The higher the intensity of scattered light, the higher the turbidity.
#Material that causes water to be turbid include clay, silt, very tiny inorganic and organic matter, algae,
#dissolved colored organic compounds, and plankton and other microscopic organisms.
# Turbidity histogram per potability class, with a box-plot marginal.
# No `y` is supplied: histfunc='count' already counts rows per bin, and the
# former y=Counter(...) dict only matched len(df) while all values were unique.
fig = px.histogram(
    df,
    x='Turbidity',
    color='Potability',
    template='plotly_white',
    marginal='box',
    opacity=0.7,
    nbins=100,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    barmode='group',
    histfunc='count',
)
# Dotted guide at the 5 NTU limit quoted in the annotation.
fig.add_vline(x=5, line_width=1, line_color=colors_dark[1], line_dash='dot', opacity=0.7)
fig.add_annotation(text='<5 NTU Turbidity is<br> considered safe', x=6, y=90, showarrow=False)
fig.update_layout(
    font_family='monospace',
    title=dict(text='Turbidity Distribution', x=0.5, y=0.95,
               font=dict(color=colors_dark[2], size=20)),
    xaxis_title_text='Turbidity (NTU)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=0.96, bordercolor=colors_dark[4], borderwidth=0, tracegroupgap=5),
    bargap=0.3,
)
fig.show()
# Pairwise scatter plots of every feature, coloured and marked by potability.
fig = px.scatter_matrix(
    df,
    dimensions=df.drop('Potability', axis=1),  # every column except the target
    color='Potability',
    symbol='Potability',
    color_discrete_sequence=[colors_blue[3], colors_green[3]],
    color_continuous_scale=[colors_green[3], colors_blue[3]],
    opacity=0.7,
    template='plotly_white',
    height=1250,
    width=1250,
)
fig.update_layout(
    font_family='monospace',
    font_size=10,
    coloraxis_showscale=False,  # hide the colour bar
    legend={'x': 0.02, 'y': 1.07, 'bgcolor': colors_dark[4]},
    title={
        'text': 'Scatter Plot Matrix b/w Features',
        'x': 0.5,
        'y': 0.97,
        'font': {'color': colors_dark[2], 'size': 24},
    },
)
fig.show()
# Feature-to-feature correlation matrix (target column excluded), drawn as a heatmap.
cor = df.drop('Potability', axis=1).corr()
cor
fig = px.imshow(
    cor,
    color_continuous_scale=colors_blue,
    template='plotly_white',
    height=800,
    width=800,
)
fig.update_layout(
    font_family='monospace',
    title={
        'text': 'Correlation Heatmap',
        'x': 0.5,
        'y': 0.93,
        'font': {'color': colors_dark[2], 'size': 24},
    },
    coloraxis_colorbar={'len': 0.85, 'x': 1.1},
)
fig.show()
#dealing with missing values
# Visualise where values are missing, then inspect per-class summaries.
fig = msno.matrix(df, color=(0, 0.5, 0.5))
df.isnull().sum()
df[df['Potability'] == 0].describe()
df[df['Potability'] == 1].describe()
df[df['Potability'] == 0][['ph', 'Sulfate', 'Trihalomethanes']].median()
df[df['Potability'] == 0][['ph', 'Sulfate', 'Trihalomethanes']].mean()
# The mean and median are nearly identical, so the median is used for imputation.
# Assign the filled columns back instead of calling fillna(inplace=True) on
# df[col]: that chained form works on an intermediate object, emits a warning
# on pandas 2.x and leaves df untouched under Copy-on-Write.
incomplete_columns = ['ph', 'Sulfate', 'Trihalomethanes']
df[incomplete_columns] = df[incomplete_columns].fillna(df[incomplete_columns].median())
# NOTE(review): the medians are computed over the whole dataset, before the
# train/test split further down, so test rows influence the imputed values.
# Confirm the imputation worked: every column should now report 0 nulls.
df.isnull().sum()
# Feature matrix and target vector as plain NumPy arrays.
X = df.drop(columns='Potability').to_numpy()
X
y = df['Potability'].to_numpy()
y
df
# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=101, test_size=.3
)
# Standardise using statistics learned from the training portion only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Silence library warnings for the rest of the run.
filterwarnings('ignore')
# Candidate classifiers, each paired with a short display name.
models = [
    ("LR", LogisticRegression(max_iter=1000)),
    ("SVC", SVC()),
    ("KNN", KNeighborsClassifier(n_neighbors=10)),
    ("DTC", DecisionTreeClassifier()),
    ("GNB", GaussianNB()),
    ("SGDC", SGDClassifier()),
    ("Perc", Perceptron()),
    ("NC", NearestCentroid()),
    ("Ridge", RidgeClassifier()),
    ("BNB", BernoulliNB()),
    ("RF", RandomForestClassifier()),
    ("ADA", AdaBoostClassifier()),
    ("XGB", GradientBoostingClassifier()),
    ('PAC', PassiveAggressiveClassifier()),
]
results, names, finalresults = [], [], []
# Fit every candidate on the training split and record its macro-averaged
# precision on the hold-out split.
for name, model in models:
    model.fit(X_train, y_train)
    model_results = model.predict(X_test)
    score = precision_score(y_test, model_results, average='macro')
    results.append(score)
    names.append(name)
    finalresults.append((name, score))
# Rank the candidates from best to worst score.
finalresults.sort(key=lambda pair: pair[1], reverse=True)
finalresults
# Hyper-parameter search spaces for the two estimators tuned below, keyed by
# display name. 'XGB' is scikit-learn's GradientBoostingClassifier.
model_params = {
    'XGB': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [.0001, .001, .01, .1],
            'n_estimators': [100, 200, 500, 1000],
            'max_features': ['sqrt', 'log2'],
            # Start at 1: a depth of 0 is not a valid tree depth and made those
            # candidates fail to fit (silently, since warnings are suppressed).
            'max_depth': list(range(1, 11)),
        },
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 50, 100, 200],
            # 'auto' dropped: for classifiers it was an alias of 'sqrt' and is
            # rejected by scikit-learn >= 1.3.
            'max_features': ['sqrt', 'log2'],
            'max_depth': list(range(1, 11)),
        },
    },
}
# Randomised hyper-parameter search: 20 sampled candidates per estimator,
# each scored with 5-fold stratified CV repeated twice.
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=5)
scores = []
for model_name, params in model_params.items():
    rs = RandomizedSearchCV(
        estimator=params['model'],
        param_distributions=params['params'],
        n_iter=20,
        cv=cv,
    )
    rs.fit(X, y)
    scores += [[model_name, dict(rs.best_params_), rs.best_score_]]
# One row per tuned estimator: name, winning parameters, best CV score.
data = pd.DataFrame(scores, columns=['Model', 'Parameters', 'Score'])
data
param = data['Parameters']
# Rebuild both estimators with their best parameters (row 0 and row 1 follow
# the insertion order of model_params) and combine them by majority vote.
tuned_gradient_boosting = GradientBoostingClassifier(**param[0])
tuned_random_forest = RandomForestClassifier(**param[1])
model = VotingClassifier(
    estimators=[('XGB', tuned_gradient_boosting), ('RF', tuned_random_forest)],
    voting='hard',
)
# Estimate the voting ensemble's accuracy with 5-fold stratified CV repeated
# twice, re-fitting the scaler on each training fold.
accuracy = []
scaler = StandardScaler()
skf = RepeatedStratifiedKFold(n_repeats=2, n_splits=5)
skf.get_n_splits(X, y)
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Scaling statistics come from the training fold only.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    accuracy.append(score)
# Mean accuracy across all folds and repeats.
np.mean(accuracy)
"""
The TDS levels seem to contain some discrepancy, since the values are on average about 40 times higher than the upper limit for safe drinking water.
The data contains almost equal number of acidic and basic pH level water samples.
92% of the data was considered Hard.
Only 2% of the water samples were safe in terms of Chloramines levels.
Only 1.8% of the water samples were safe in terms of Sulfate levels.
90.6% of the water samples had higher Carbon levels than the typical Carbon levels in drinking water (10 ppm).
76.6% of water samples were safe for drinking in terms of Trihalomethane levels in water.
90.4% of the water samples were safe for drinking in terms of the Turbidity of water samples.
The correlation coefficients between the features were very low.
Random Forest and Gradient Boosting (labelled XGB in the code) worked the best to train the model.
The ensemble method of using the Voting Classifier on Stratified K-folded samples gave an accuracy of >64%
"""